The purpose of this document is to benchmark R's big-data packages (base R, the tidyverse's readr/dplyr/tidyr, vroom, and data.table) against each other. Here are the properties of the machine running the benchmarks.

sysctl -n machdep.cpu.brand_string
printf -v a "Number of processors: %s" $(sysctl -n hw.ncpu)
echo $a
## Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
## Number of processors: 8

Required Packages

We will install the needed packages from the other analyses. Even though a package “bench” exists, we will use the package “microbenchmark” due to it not being tidy-dependent. Also, “bench” requires the output of the benchmarked functions to return the exact same object, which is a bit overly strict for our purposes.

# Set global options: a CRAN mirror for installs, warnings suppressed
# while knitting, and strings kept as characters (the default since R 4.0).
options(repos = "https://cran.rstudio.com/",
        warn  = -1,
        stringsAsFactors = FALSE)

# currently installed packages vs. the packages this analysis requires
cur_pkgs <- rownames(installed.packages())
req_pkgs <- c("tidyverse",
              "here",
              "vroom",
              "magrittr",
              "data.table",
              "microbenchmark")

# determine packages that are missing
miss_pkgs <- setdiff(x = req_pkgs,
                     y = cur_pkgs)

# install only the missing packages (explicit length test, not truthiness)
if (length(miss_pkgs) > 0) {
  install.packages(miss_pkgs)
}

# load all of the relevant packages
library(magrittr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(microbenchmark)

Let’s also put a function that will run the benchmark for everything and suppress warnings/messages.

run_benchmark <- function(..., unit = "ms"){
  # Time the expressions in `...` with microbenchmark, suppressing any
  # warnings/messages they emit, and return the raw benchmark plus
  # absolute and relative summaries and an autoplot.
  #
  # Args:
  #   ...  : expressions to time (captured unevaluated; their source text
  #          becomes the `expr` labels in every summary/plot)
  #   unit : time unit for the "default" summary (e.g. "ms", "us")
  #
  # `requireNamespace()` instead of `require()`: it errors loudly here and,
  # together with the namespaced call head below, does not depend on the
  # package being attached to the search path.
  if (!requireNamespace("microbenchmark", quietly = TRUE)) {
    stop("Package 'microbenchmark' is required.", call. = FALSE)
  }
  
  # rewrite this very call so the same arguments go to microbenchmark
  cur_call      <- match.call()
  cur_call[[1]] <- quote(microbenchmark::microbenchmark)
  
  # evaluate in the caller's frame so the expressions see its variables;
  # suppress the chatter the timed expressions produce
  bnch          <- suppressWarnings(
    suppressMessages(
      eval(cur_call, envir = parent.frame())
    )
  )
  
  # creating list of benchmark/summary/plot
  list(benchmark = bnch,
       default   = summary(bnch, unit = unit),
       relative  = summary(bnch, unit = "relative"),
       plot      = autoplot(bnch))
}

Let’s also put a function that will plot benchmarks if we are combining a bunch of benchmarks in a list.

plot_benchmarks <- function(bnch_list,
                            id   = "percent_missing",
                            unit = "relative",
                            use  = "median"){
  
  # summarise each stored benchmark and stack the results into one
  # data.frame, with the list names recorded in the `id` column
  bench_objs <- lapply(bnch_list, FUN = "[[", "benchmark")
  summaries  <- lapply(bench_objs, FUN = summary, unit = unit)
  dfs        <- bind_rows(summaries, .id = id)
  
  # keep the id levels in their original (list) order, not alphabetical
  dfs[[id]] <- factor(dfs[[id]], levels = unique(dfs[[id]]))
  
  # dodged barplot of the chosen statistic, one bar group per id value
  ggplot(data    = dfs,
         mapping = aes(x    = !!sym(id),
                       y    = !!sym(use),
                       fill = expr)) +
    geom_col(position = position_dodge()) +
    theme_minimal()
}

Loading Data

Identify the project's root directory in R:

# resolve the project root with here() and point at its data/ subfolder
project_dir <- here::here()
data_dir    <- file.path(project_dir, "data")

Benchmarks for small files:

# time four CSV readers on the small file; each expression's source text
# becomes its label in the benchmark output
small_file      <- file.path(data_dir, "demos_to_merge.csv")
bnch_read_small <- run_benchmark(
  read.csv(small_file, header = TRUE),
  read_csv(small_file),
  vroom::vroom(small_file),
  fread(small_file)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_read_small
## $benchmark
## Unit: milliseconds
##                                 expr       min        lq      mean    median
##  read.csv(small_file, header = TRUE)  1.282216  1.374542  1.453796  1.421444
##                 read_csv(small_file) 15.345299 15.997121 18.301205 16.627820
##             vroom::vroom(small_file)  8.648039  9.088017 10.267576  9.398193
##                    fread(small_file)  1.018439  1.224679  1.342771  1.357349
##         uq        max neval
##   1.505419   2.773271   100
##  17.090287 153.541067   100
##   9.718265  81.681823   100
##   1.426810   3.170288   100
## 
## $default
##                                  expr       min        lq      mean    median
## 1 read.csv(small_file, header = TRUE)  1.282216  1.374542  1.453796  1.421444
## 2                read_csv(small_file) 15.345299 15.997121 18.301205 16.627820
## 3            vroom::vroom(small_file)  8.648039  9.088017 10.267576  9.398193
## 4                   fread(small_file)  1.018439  1.224679  1.342771  1.357349
##          uq        max neval
## 1  1.505419   2.773271   100
## 2 17.090287 153.541067   100
## 3  9.718265  81.681823   100
## 4  1.426810   3.170288   100
## 
## $relative
##                                  expr       min        lq      mean   median
## 1 read.csv(small_file, header = TRUE)  1.259001  1.122369  1.082683  1.04722
## 2                read_csv(small_file) 15.067470 13.062297 13.629428 12.25021
## 3            vroom::vroom(small_file)  8.491465  7.420734  7.646556  6.92393
## 4                   fread(small_file)  1.000000  1.000000  1.000000  1.00000
##          uq        max neval
## 1  1.055095  0.8747694   100
## 2 11.977974 48.4312678   100
## 3  6.811186 25.7647958   100
## 4  1.000000  1.0000000   100
## 
## $plot

Benchmarks for large files:

# the same four readers on the large file
large_file      <- file.path(data_dir, "master_data_20210315.csv")
bnch_read_large <- run_benchmark(
  read.csv(large_file, header = TRUE),
  read_csv(large_file),
  fread(large_file),
  vroom::vroom(large_file)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_read_large
## $benchmark
## Unit: milliseconds
##                                 expr        min         lq       mean
##  read.csv(large_file, header = TRUE) 1115.27395 1208.69211 1364.37843
##                 read_csv(large_file)  351.41107  387.80298  451.24115
##                    fread(large_file)  104.79205  118.67855  155.92661
##             vroom::vroom(large_file)   28.79996   30.26235   32.98453
##      median        uq        max neval
##  1323.45623 1444.0462 2579.52315   100
##   416.46413  478.0314  753.24136   100
##   124.54401  170.5779  463.35285   100
##    32.30613   34.5490   51.23582   100
## 
## $default
##                                  expr        min         lq       mean
## 1 read.csv(large_file, header = TRUE) 1115.27395 1208.69211 1364.37843
## 2                read_csv(large_file)  351.41107  387.80298  451.24115
## 3                   fread(large_file)  104.79205  118.67855  155.92661
## 4            vroom::vroom(large_file)   28.79996   30.26235   32.98453
##       median        uq        max neval
## 1 1323.45623 1444.0462 2579.52315   100
## 2  416.46413  478.0314  753.24136   100
## 3  124.54401  170.5779  463.35285   100
## 4   32.30613   34.5490   51.23582   100
## 
## $relative
##                                  expr       min        lq      mean   median
## 1 read.csv(large_file, header = TRUE) 38.724840 39.940454 41.364189 40.96611
## 2                read_csv(large_file) 12.201789 12.814700 13.680387 12.89118
## 3                   fread(large_file)  3.638618  3.921656  4.727265  3.85512
## 4            vroom::vroom(large_file)  1.000000  1.000000  1.000000  1.00000
##          uq       max neval
## 1 41.797048 50.346089   100
## 2 13.836332 14.701460   100
## 3  4.937274  9.043533   100
## 4  1.000000  1.000000   100
## 
## $plot

Let’s read in the small set of data and the large set of data and divide it so that it can be used to benchmark what follows. Note that we’re not doing super-systematic benchmarks, just an example of large data and small data.

read_data <- function(path){
  # Read a CSV from `path` and return it without its first column
  # (a row-index column written by an earlier save).
  raw <- read.csv(path)
  raw[, -1, drop = FALSE]
}

# 1. READING SMALL #

# we will use read.csv so as not to pick a fight with tidy vs data.table people
small_demos    <- read_data(file.path(data_dir, "demos_to_merge.csv"))
small_scores   <- read_data(file.path(data_dir, "scores_to_merge.csv"))
small_comb_1   <- read_data(file.path(data_dir, "data_to_rowbind.csv"))

# creating additional combination data for benchmarking
# (merge() joins on all shared columns; rbind stacks the two sets row-wise)
small_comb_2   <- merge(small_demos, small_scores)
small_comb_all <- rbind(small_comb_1,
                        small_comb_2)

# 2. READING LARGE #

large_comb_all <- read_data(large_file)

# 3. DIVIDING LARGE #

# dividing large_comb_all into several sets: the first ~1/5 of rows
# become the join inputs, the remaining ~4/5 are held separately
n_rows_large   <- nrow(large_comb_all)
merge_idx      <- seq_len(ceiling(n_rows_large / 5))

large_comb_1   <- large_comb_all[-merge_idx, ]
large_comb_2   <- large_comb_all[merge_idx, ]
# project out the same column sets as the small tables so the large join
# inputs mirror the small ones
large_demos    <- large_comb_2[names(small_demos)]
large_scores   <- large_comb_2[names(small_scores)]

Merging Data

Combine Variables

First, let’s have some preliminary objects. 1. The variable marking the “ID” column on which we are going to merge. 2. A function to automatically perform the merges and return useful results.

id_var     <- "guid"

# Benchmark inner/left/right/outer joins of x and y four ways: base
# merge(), dplyr's *_join(), data.table merge(), and keyed data.table
# `[` joins.
#
# Args:
#   x, y : data.frames to join
#   by   : join column name(s)
#   keep : proportion of rows to retain in x and y (recycled to length 2,
#          clamped to [0, 1]); rows are sampled randomly, so they are
#          shuffled even when keep = 1
#
# Returns a named list of run_benchmark() results, one per join type.
test_joins <- function(x, y,
                       by   = "guid",
                       keep = 1){
  
  # make sure keep is length 2 and bounded within [0, 1]
  keep    <- pmax(0, pmin(1, rep_len(keep, 2)))
  
  # sampling rows of x and y to match keep
  x_and_y <- Map(
    f   = function(df, prob){
      n <- nrow(df)
      df[sample.int(n, size = ceiling(prob * n)), , drop = FALSE]
    },
    df   = list(x = x, y = y),
    prob = keep
  )
  
  # pulling out x and y for purposes of merging
  x       <- x_and_y$x
  y       <- x_and_y$y
  
  # turning into data.tables and setting keys for the keyed `[` joins
  x_dt   <- as.data.table(x)
  y_dt   <- as.data.table(y)
  
  setkeyv(x_dt, by)
  setkeyv(y_dt, by)
  
  # inner join: nomatch = 0 drops unmatched rows in the keyed join
  bnch_inner <- run_benchmark(
    merge(x, y, by = by, all = FALSE),
    inner_join(x, y, by = by),
    merge(x_dt, y_dt, by = by, all = FALSE),
    x_dt[y_dt, nomatch = 0]
  )

  # left join keeps all rows of x; in data.table X[Y] keeps all rows of Y,
  # so the left-join analogue is y_dt[x_dt] (the original code used
  # x_dt[y_dt] here, which is actually a right join of x by y)
  bnch_left <- run_benchmark(
    merge(x, y, by = by, all.x = TRUE),
    left_join(x, y, by = by),
    merge(x_dt, y_dt, by = by, all.x = TRUE),
    y_dt[x_dt]
  )
  
  # right join keeps all rows of y; the data.table analogue is x_dt[y_dt]
  bnch_right <- run_benchmark(
    merge(x, y, by = by, all.y = TRUE),
    right_join(x, y, by = by),
    merge(x_dt, y_dt, by = by, all.y = TRUE),
    x_dt[y_dt]
  )
  
  # full outer join (no single keyed-`[` idiom; data.table merge covers it)
  bnch_outer <- run_benchmark(
    merge(x, y, by = by, all = TRUE),
    full_join(x, y, by = by),
    merge(x_dt, y_dt, by = by, all = TRUE)
  )
  
  list(inner = bnch_inner,
       left  = bnch_left,
       right = bnch_right,
       outer = bnch_outer)
}

Benchmarks for small data:

# join benchmarks on the small demographic/score tables (keep defaults to 1)
bnch_join_small_1 <- test_joins(x  = small_demos,
                                y  = small_scores,
                                by = id_var)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Inner
bnch_join_small_1$inner
## $benchmark
## Unit: microseconds
##                                     expr      min       lq      mean    median
##        merge(x, y, by = by, all = FALSE) 1598.126 1691.903 1801.2448 1812.8450
##                inner_join(x, y, by = by) 1636.832 1820.959 2040.8562 1982.9700
##  merge(x_dt, y_dt, by = by, all = FALSE) 1096.366 1178.578 1519.7472 1303.2040
##                  x_dt[y_dt, nomatch = 0]  708.224  735.743  910.1826  844.9145
##         uq       max neval
##  1899.7190  2342.695   100
##  2140.9245  7065.595   100
##  1392.3395 22953.444   100
##   875.5345  9050.191   100
## 
## $default
##                                      expr      min       lq      mean    median
## 1       merge(x, y, by = by, all = FALSE) 1.598126 1.691903 1.8012448 1.8128450
## 2               inner_join(x, y, by = by) 1.636832 1.820959 2.0408562 1.9829700
## 3 merge(x_dt, y_dt, by = by, all = FALSE) 1.096366 1.178578 1.5197472 1.3032040
## 4                 x_dt[y_dt, nomatch = 0] 0.708224 0.735743 0.9101826 0.8449145
##          uq       max neval
## 1 1.8997190  2.342695   100
## 2 2.1409245  7.065595   100
## 3 1.3923395 22.953444   100
## 4 0.8755345  9.050191   100
## 
## $relative
##                                      expr      min       lq     mean   median
## 1       merge(x, y, by = by, all = FALSE) 2.256526 2.299584 1.978993 2.145596
## 2               inner_join(x, y, by = by) 2.311178 2.474993 2.242249 2.346948
## 3 merge(x_dt, y_dt, by = by, all = FALSE) 1.548050 1.601888 1.669717 1.542409
## 4                 x_dt[y_dt, nomatch = 0] 1.000000 1.000000 1.000000 1.000000
##         uq       max neval
## 1 2.169782 0.2588559   100
## 2 2.445277 0.7807123   100
## 3 1.590274 2.5362386   100
## 4 1.000000 1.0000000   100
## 
## $plot

# Left
bnch_join_small_1$left
## $benchmark
## Unit: microseconds
##                                      expr      min       lq      mean    median
##        merge(x, y, by = by, all.x = TRUE) 1584.065 1668.655 3000.9129 1717.7825
##                  left_join(x, y, by = by) 1621.288 1759.643 1880.9304 1841.5825
##  merge(x_dt, y_dt, by = by, all.x = TRUE) 1097.691 1171.461 1220.1517 1211.9225
##                                x_dt[y_dt]  716.161  741.600  777.6728  755.3995
##         uq        max neval
##  1770.6570 128892.856   100
##  1976.4255   2565.177   100
##  1249.1685   1405.529   100
##   800.0455   1051.161   100
## 
## $default
##                                       expr      min       lq      mean
## 1       merge(x, y, by = by, all.x = TRUE) 1.584065 1.668655 3.0009129
## 2                 left_join(x, y, by = by) 1.621288 1.759643 1.8809304
## 3 merge(x_dt, y_dt, by = by, all.x = TRUE) 1.097691 1.171460 1.2201517
## 4                               x_dt[y_dt] 0.716161 0.741600 0.7776728
##      median        uq        max neval
## 1 1.7177825 1.7706570 128.892856   100
## 2 1.8415825 1.9764255   2.565177   100
## 3 1.2119225 1.2491685   1.405529   100
## 4 0.7553995 0.8000455   1.051161   100
## 
## $relative
##                                       expr      min       lq     mean   median
## 1       merge(x, y, by = by, all.x = TRUE) 2.211884 2.250075 3.858837 2.274005
## 2                 left_join(x, y, by = by) 2.263860 2.372766 2.418665 2.437892
## 3 merge(x_dt, y_dt, by = by, all.x = TRUE) 1.532743 1.579639 1.568978 1.604346
## 4                               x_dt[y_dt] 1.000000 1.000000 1.000000 1.000000
##         uq        max neval
## 1 2.213195 122.619519   100
## 2 2.470391   2.440327   100
## 3 1.561372   1.337121   100
## 4 1.000000   1.000000   100
## 
## $plot

# Right
bnch_join_small_1$right
## $benchmark
## Unit: microseconds
##                                      expr      min        lq      mean
##        merge(x, y, by = by, all.y = TRUE) 1589.690 1667.0275 1707.8532
##                 right_join(x, y, by = by) 1650.160 1833.8985 1928.8905
##  merge(x_dt, y_dt, by = by, all.y = TRUE) 1643.661 1728.8990 2092.8514
##                                y_dt[x_dt]  705.300  733.5985  767.8416
##     median        uq       max neval
##  1694.9705 1732.2710  1933.605   100
##  1901.2605 2002.3065  2289.361   100
##  1783.7465 1883.4460 30080.406   100
##   754.3565  790.9835   882.657   100
## 
## $default
##                                       expr      min        lq      mean
## 1       merge(x, y, by = by, all.y = TRUE) 1.589690 1.6670275 1.7078532
## 2                right_join(x, y, by = by) 1.650160 1.8338985 1.9288905
## 3 merge(x_dt, y_dt, by = by, all.y = TRUE) 1.643661 1.7288990 2.0928514
## 4                               y_dt[x_dt] 0.705300 0.7335985 0.7678416
##      median        uq       max neval
## 1 1.6949705 1.7322710  1.933605   100
## 2 1.9012605 2.0023065  2.289361   100
## 3 1.7837465 1.8834460 30.080406   100
## 4 0.7543565 0.7909835  0.882657   100
## 
## $relative
##                                       expr      min       lq     mean   median
## 1       merge(x, y, by = by, all.y = TRUE) 2.253920 2.272398 2.224226 2.246909
## 2                right_join(x, y, by = by) 2.339657 2.499867 2.512094 2.520374
## 3 merge(x_dt, y_dt, by = by, all.y = TRUE) 2.330442 2.356737 2.725629 2.364594
## 4                               y_dt[x_dt] 1.000000 1.000000 1.000000 1.000000
##         uq       max neval
## 1 2.190022  2.190664   100
## 2 2.531414  2.593715   100
## 3 2.381144 34.079383   100
## 4 1.000000  1.000000   100
## 
## $plot

# Outer
bnch_join_small_1$outer
## $benchmark
## Unit: milliseconds
##                                    expr      min       lq     mean   median
##        merge(x, y, by = by, all = TRUE) 1.587842 1.629756 1.888675 1.653839
##                full_join(x, y, by = by) 1.653139 1.691276 1.820628 1.798546
##  merge(x_dt, y_dt, by = by, all = TRUE) 1.591998 1.640868 1.729458 1.704767
##        uq       max neval
##  1.683472 24.432939   100
##  1.929545  2.236942   100
##  1.776987  2.286940   100
## 
## $default
##                                     expr      min       lq     mean   median
## 1       merge(x, y, by = by, all = TRUE) 1.587842 1.629756 1.888675 1.653839
## 2               full_join(x, y, by = by) 1.653139 1.691276 1.820628 1.798546
## 3 merge(x_dt, y_dt, by = by, all = TRUE) 1.591998 1.640868 1.729458 1.704767
##         uq       max neval
## 1 1.683472 24.432939   100
## 2 1.929545  2.236942   100
## 3 1.776987  2.286940   100
## 
## $relative
##                                     expr      min       lq      mean   median
## 1       merge(x, y, by = by, all = TRUE) 1.000000 1.000000 1.0000000 1.000000
## 2               full_join(x, y, by = by) 1.041123 1.037748 0.9639708 1.087498
## 3 merge(x_dt, y_dt, by = by, all = TRUE) 1.002617 1.006818 0.9156988 1.030794
##         uq        max neval
## 1 1.000000 1.00000000   100
## 2 1.146170 0.09155436   100
## 3 1.055549 0.09360069   100
## 
## $plot

Benchmarks for large data:

# the same join benchmarks on the large demographic/score tables
bnch_join_large_1 <- test_joins(x  = large_demos,
                                y  = large_scores,
                                by = id_var)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Inner
bnch_join_large_1$inner
## $benchmark
## Unit: milliseconds
##                                     expr       min        lq      mean
##        merge(x, y, by = by, all = FALSE) 85.121168 85.892314 89.186439
##                inner_join(x, y, by = by)  7.043479  7.509643  9.531737
##  merge(x_dt, y_dt, by = by, all = FALSE)  5.374240  5.989276  6.385281
##                  x_dt[y_dt, nomatch = 0]  4.633596  5.405561  7.420612
##     median        uq       max neval
##  86.604356 88.754589 118.10214   100
##   7.674637  8.049369  40.90420   100
##   6.109198  6.267143  31.83852   100
##   5.587108  5.764193  36.98498   100
## 
## $default
##                                      expr       min        lq      mean
## 1       merge(x, y, by = by, all = FALSE) 85.121168 85.892314 89.186439
## 2               inner_join(x, y, by = by)  7.043479  7.509643  9.531737
## 3 merge(x_dt, y_dt, by = by, all = FALSE)  5.374240  5.989276  6.385281
## 4                 x_dt[y_dt, nomatch = 0]  4.633596  5.405561  7.420612
##      median        uq       max neval
## 1 86.604356 88.754589 118.10214   100
## 2  7.674637  8.049369  40.90420   100
## 3  6.109198  6.267143  31.83852   100
## 4  5.587108  5.764193  36.98498   100
## 
## $relative
##                                      expr       min        lq       mean
## 1       merge(x, y, by = by, all = FALSE) 18.370434 15.889621 12.0187450
## 2               inner_join(x, y, by = by)  1.520089  1.389244  1.2844948
## 3 merge(x_dt, y_dt, by = by, all = FALSE)  1.159842  1.107984  0.8604791
## 4                 x_dt[y_dt, nomatch = 0]  1.000000  1.000000  1.0000000
##      median        uq       max neval
## 1 15.500749 15.397575 3.1932455   100
## 2  1.373633  1.396443 1.1059677   100
## 3  1.093445  1.087254 0.8608499   100
## 4  1.000000  1.000000 1.0000000   100
## 
## $plot

# Left
bnch_join_large_1$left
## $benchmark
## Unit: milliseconds
##                                      expr       min        lq      mean
##        merge(x, y, by = by, all.x = TRUE) 85.293129 85.735330 91.044625
##                  left_join(x, y, by = by)  6.978856  7.446243  8.954506
##  merge(x_dt, y_dt, by = by, all.x = TRUE)  5.229084  5.625960  6.255178
##                                x_dt[y_dt]  4.401157  5.101157  5.763486
##     median        uq       max neval
##  86.110446 86.696562 209.64710   100
##   7.627331  7.781408  41.31165   100
##   5.754943  5.894331  30.65941   100
##   5.257812  5.315555  33.60808   100
## 
## $default
##                                       expr       min        lq      mean
## 1       merge(x, y, by = by, all.x = TRUE) 85.293129 85.735330 91.044625
## 2                 left_join(x, y, by = by)  6.978856  7.446243  8.954506
## 3 merge(x_dt, y_dt, by = by, all.x = TRUE)  5.229084  5.625960  6.255178
## 4                               x_dt[y_dt]  4.401157  5.101157  5.763486
##      median        uq       max neval
## 1 86.110446 86.696562 209.64710   100
## 2  7.627331  7.781408  41.31165   100
## 3  5.754943  5.894331  30.65941   100
## 4  5.257812  5.315555  33.60808   100
## 
## $relative
##                                       expr       min        lq      mean
## 1       merge(x, y, by = by, all.x = TRUE) 19.379706 16.807034 15.796797
## 2                 left_join(x, y, by = by)  1.585687  1.459716  1.553661
## 3 merge(x_dt, y_dt, by = by, all.x = TRUE)  1.188116  1.102879  1.085312
## 4                               x_dt[y_dt]  1.000000  1.000000  1.000000
##      median        uq       max neval
## 1 16.377618 16.309974 6.2379975   100
## 2  1.450666  1.463894 1.2292179   100
## 3  1.094551  1.108884 0.9122632   100
## 4  1.000000  1.000000 1.0000000   100
## 
## $plot

# Right
bnch_join_large_1$right
## $benchmark
## Unit: milliseconds
##                                      expr       min        lq      mean
##        merge(x, y, by = by, all.y = TRUE) 85.130469 85.666714 90.357069
##                 right_join(x, y, by = by)  7.723282  8.531254  9.527223
##  merge(x_dt, y_dt, by = by, all.y = TRUE)  7.670457  9.085331 10.313555
##                                y_dt[x_dt]  4.050937  5.052690  5.671638
##     median        uq       max neval
##  86.091155 87.262653 118.58536   100
##   8.703039  8.956814  38.54915   100
##   9.281391  9.446704  41.19841   100
##   5.247997  5.385542  32.72982   100
## 
## $default
##                                       expr       min        lq      mean
## 1       merge(x, y, by = by, all.y = TRUE) 85.130469 85.666714 90.357069
## 2                right_join(x, y, by = by)  7.723282  8.531254  9.527223
## 3 merge(x_dt, y_dt, by = by, all.y = TRUE)  7.670457  9.085331 10.313555
## 4                               y_dt[x_dt]  4.050937  5.052690  5.671638
##      median        uq       max neval
## 1 86.091155 87.262653 118.58536   100
## 2  8.703039  8.956814  38.54915   100
## 3  9.281391  9.446704  41.19841   100
## 4  5.247997  5.385542  32.72982   100
## 
## $relative
##                                       expr       min        lq      mean
## 1       merge(x, y, by = by, all.y = TRUE) 21.015007 16.954673 15.931389
## 2                right_join(x, y, by = by)  1.906542  1.688458  1.679801
## 3 merge(x_dt, y_dt, by = by, all.y = TRUE)  1.893502  1.798117  1.818444
## 4                               y_dt[x_dt]  1.000000  1.000000  1.000000
##      median        uq      max neval
## 1 16.404574 16.203131 3.623159   100
## 2  1.658355  1.663122 1.177799   100
## 3  1.768559  1.754086 1.258742   100
## 4  1.000000  1.000000 1.000000   100
## 
## $plot

# Outer
bnch_join_large_1$outer
## $benchmark
## Unit: milliseconds
##                                    expr       min        lq      mean    median
##        merge(x, y, by = by, all = TRUE) 84.733308 85.510103 88.351556 85.882728
##                full_join(x, y, by = by)  8.155158  8.506530 10.870472  8.690476
##  merge(x_dt, y_dt, by = by, all = TRUE)  8.190572  8.767346  9.759042  9.013307
##         uq       max neval
##  86.452453 115.47279   100
##   8.978945  38.10126   100
##   9.179148  41.56250   100
## 
## $default
##                                     expr       min        lq      mean
## 1       merge(x, y, by = by, all = TRUE) 84.733308 85.510103 88.351556
## 2               full_join(x, y, by = by)  8.155158  8.506530 10.870472
## 3 merge(x_dt, y_dt, by = by, all = TRUE)  8.190572  8.767346  9.759042
##      median        uq       max neval
## 1 85.882728 86.452453 115.47279   100
## 2  8.690476  8.978945  38.10126   100
## 3  9.013307  9.179148  41.56250   100
## 
## $relative
##                                     expr       min        lq     mean   median
## 1       merge(x, y, by = by, all = TRUE) 10.390149 10.052290 8.127665 9.882396
## 2               full_join(x, y, by = by)  1.000000  1.000000 1.000000 1.000000
## 3 merge(x_dt, y_dt, by = by, all = TRUE)  1.004343  1.030661 0.897757 1.037148
##         uq      max neval
## 1 9.628353 3.030682   100
## 2 1.000000 1.000000   100
## 3 1.022297 1.090843   100
## 
## $plot

We can look at what happens when each input table retains only a certain proportion of its rows. The rows are sampled independently for each table, so not all rows will match across the two tables being joined.

set.seed(888)
# proportions of rows to KEEP in each table, named by value via setNames
keep_perc            <- setNames(nm = seq(.2, .8, by = .2))

# running benchmarks: each keep proportion is passed positionally by
# lapply and therefore binds to test_joins' `keep` argument, since
# x, y, and by are all supplied by name
bnch_join_large_2    <- lapply(X   = keep_perc,
                               FUN = test_joins,
                               x   = large_demos,
                               y   = large_scores,
                               by  = id_var)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# rearranging outer and inner lists: do.call(Map, c(list(f = list), ...))
# transposes the nesting from keep-value -> join-type into
# join-type -> keep-value, so each join type gathers one entry per keep
bnch_join_large_2    <- do.call(what = Map,
                                args = c(list(f = list), bnch_join_large_2))

# creating plots of all of the benchmarks
# NOTE(review): the id label is "percent_missing" but the list names are
# the KEPT proportions (.2-.8) — confirm the intended axis label
bnch_join_large_2_gg <- lapply(X    = bnch_join_large_2,
                               FUN  = plot_benchmarks,
                               id   = "percent_missing",
                               unit = "relative",
                               use  = "median")

# Inner
bnch_join_large_2_gg$inner

# Left
bnch_join_large_2_gg$left

# Right
bnch_join_large_2_gg$right

# Outer
bnch_join_large_2_gg$outer

Combine People

First, let’s have some preliminary objects. 1. A function that automatically splits the data, binds the pieces back together, and returns useful results.

test_binds <- function(x,
                       n_splits = 2){
  # Split `x` row-wise into `n_splits` interleaved groups, then benchmark
  # re-binding them with base rbind, dplyr::bind_rows, and
  # data.table::rbindlist.
  #
  # Args:
  #   x        : data.frame to split and re-bind
  #   n_splits : number of groups (rows assigned round-robin via rep_len)

  # splitting data (split.data.frame keeps whole rows, unlike plain split)
  x    <- split.data.frame(x = x,
                           f = rep_len(seq_len(n_splits), nrow(x)))
  
  # binding everything together again
  run_benchmark(
    do.call(rbind, x),
    bind_rows(x),
    rbindlist(x)
  )
}

Specifying the total number of groups we always want to compare.

# group counts 2, 12, ..., 102; setNames(nm =) names each element by its value
n_groups        <- setNames(nm = seq(2, 102, by = 10))

Benchmarks for small data:

# bind benchmarks for each group count on the small data; each element of
# n_groups is passed positionally and so binds to test_binds' `n_splits`
bnch_bind_small <- lapply(X   = n_groups,
                          FUN = test_binds,
                          x   = small_comb_all)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# compare median bind times across group counts (relative units)
plot_benchmarks(
  bnch_list = bnch_bind_small,
  id        = "n_groups",
  unit      = "relative",
  use       = "median"
)

Benchmarks for large data:

# the same bind benchmarks for each group count on the large data
bnch_bind_large <- lapply(X   = n_groups,
                          FUN = test_binds,
                          x   = large_comb_all)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Plot the bind benchmarks on the large data: relative median timings,
# one panel per group count.
plot_benchmarks(
  bnch_list = bnch_bind_large,
  id        = "n_groups",
  unit      = "relative",
  use       = "median"
)

Reshaping Data

Let’s define some preliminary objects shared across all of the reshaping benchmarks: 1. the names of the variable column and the value column, and 2. the names of the wide-format columns whose values will be stacked into the value column.

# Shared objects for the reshaping benchmarks.
var_name  <- "trait"        # name of the variable column in long format
val_name  <- "normed_score" # name of the value column in long format
# Names of the wide score columns (X1, X2, ...) that get stacked when
# going from wide to long.
var_vals  <- grep(x       = names(small_scores),
                  pattern = "^X[0-9]+",
                  value   = TRUE)

Wide to Long

We can add functions to quickly reshape data and make it easier to apply.

# Wide -> long using base R's stats::reshape().
# NOTE(review): idvar is always taken from names(small_demos), even when
# df is the large data set -- presumably both data sets share identical
# demographic column names; confirm upstream where the data are built.
wide_to_long_base <- function(df){
  reshape(df,
          varying   = var_vals,   # wide columns to stack
          v.names   = val_name,   # name of resulting value column
          timevar   = var_name,   # name of resulting variable column
          idvar     = names(small_demos),
          direction = "long")
}

# Wide -> long using tidyr::gather(). gather() is superseded by
# pivot_longer() but is kept so both tidyr approaches can be benchmarked.
# !! injects the string column names; one_of() selects the wide columns.
wide_to_long_tidy_gather <- function(df){
  gather(df,
         key    = !!var_name,
         value  = !!val_name,
         one_of(var_vals))
}

# Wide -> long using tidyr::pivot_longer().
# The external character vector is wrapped in all_of(): passing a bare
# vector to a tidyselect argument is ambiguous (data column vs. external
# object) and deprecated in recent tidyselect versions. all_of() also
# errors early if any named column is missing.
wide_to_long_tidy_pivot <- function(df){
  pivot_longer(df,
               cols      = all_of(var_vals), # wide columns to stack
               names_to  = var_name,         # new variable column
               values_to = val_name)         # new value column
}

# Wide -> long using data.table::melt(); convert to a data.table first so
# the fast data.table melt method is dispatched.
wide_to_long_dt   <- function(df){
  dt <- as.data.table(df)
  melt(dt,
       measure.vars  = var_vals,
       variable.name = var_name,
       value.name    = val_name)
}

Benchmarks for small data:

# Benchmark all four wide -> long approaches on the small data.
bnch_shape_long_small <- run_benchmark(
  wide_to_long_base(small_comb_all),
  wide_to_long_tidy_gather(small_comb_all),
  wide_to_long_tidy_pivot(small_comb_all),
  wide_to_long_dt(small_comb_all)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_shape_long_small
## $benchmark
## Unit: microseconds
##                                      expr       min         lq       mean
##         wide_to_long_base(small_comb_all) 67637.610 69233.5815 72719.6865
##  wide_to_long_tidy_gather(small_comb_all)  7435.185  7677.5060  8275.2071
##   wide_to_long_tidy_pivot(small_comb_all)  4850.805  5241.1185  5563.4792
##           wide_to_long_dt(small_comb_all)   522.808   618.4355   672.4733
##     median        uq        max neval
##  71039.671 72829.106 124009.333   100
##   7786.309  7962.420  45648.844   100
##   5459.282  5718.937  10846.045   100
##    652.936   683.615   2608.079   100
## 
## $default
##                                       expr       min         lq       mean
## 1        wide_to_long_base(small_comb_all) 67.637610 69.2335815 72.7196865
## 2 wide_to_long_tidy_gather(small_comb_all)  7.435185  7.6775060  8.2752071
## 3  wide_to_long_tidy_pivot(small_comb_all)  4.850805  5.2411185  5.5634792
## 4          wide_to_long_dt(small_comb_all)  0.522808  0.6184355  0.6724733
##      median        uq        max neval
## 1 71.039671 72.829106 124.009333   100
## 2  7.786309  7.962420  45.648844   100
## 3  5.459282  5.718937  10.846045   100
## 4  0.652936  0.683615   2.608079   100
## 
## $relative
##                                       expr        min         lq       mean
## 1        wide_to_long_base(small_comb_all) 129.373709 111.949559 108.137664
## 2 wide_to_long_tidy_gather(small_comb_all)  14.221636  12.414401  12.305630
## 3  wide_to_long_tidy_pivot(small_comb_all)   9.278368   8.474802   8.273161
## 4          wide_to_long_dt(small_comb_all)   1.000000   1.000000   1.000000
##       median         uq       max neval
## 1 108.800359 106.535267 47.548151   100
## 2  11.925072  11.647521 17.502861   100
## 3   8.361128   8.365728  4.158634   100
## 4   1.000000   1.000000  1.000000   100
## 
## $plot

Benchmarks for large data:

# Benchmark all four wide -> long approaches on the large data.
bnch_shape_long_large <- run_benchmark(
  wide_to_long_base(large_comb_all),
  wide_to_long_tidy_gather(large_comb_all),
  wide_to_long_tidy_pivot(large_comb_all),
  wide_to_long_dt(large_comb_all)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_shape_long_large
## $benchmark
## Unit: milliseconds
##                                      expr         min          lq       mean
##         wide_to_long_base(large_comb_all) 18192.39325 18939.07560 25267.0711
##  wide_to_long_tidy_gather(large_comb_all)   718.68953   725.87641   875.2249
##   wide_to_long_tidy_pivot(large_comb_all)    79.62178    88.19576   320.8779
##           wide_to_long_dt(large_comb_all)    32.82118    43.37826   162.7716
##       median          uq       max neval
##  21315.15178 29139.91710 52758.411   100
##    729.34497   737.47770  8291.349   100
##     93.06330    95.30745  7373.518   100
##     44.94552    46.47995  3040.080   100
## 
## $default
##                                       expr         min          lq       mean
## 1        wide_to_long_base(large_comb_all) 18192.39325 18939.07560 25267.0711
## 2 wide_to_long_tidy_gather(large_comb_all)   718.68953   725.87641   875.2249
## 3  wide_to_long_tidy_pivot(large_comb_all)    79.62178    88.19576   320.8779
## 4          wide_to_long_dt(large_comb_all)    32.82118    43.37826   162.7716
##        median          uq       max neval
## 1 21315.15178 29139.91710 52758.411   100
## 2   729.34497   737.47770  8291.349   100
## 3    93.06330    95.30745  7373.518   100
## 4    44.94552    46.47995  3040.080   100
## 
## $relative
##                                       expr        min         lq       mean
## 1        wide_to_long_base(large_comb_all) 554.288130 436.602934 155.230190
## 2 wide_to_long_tidy_gather(large_comb_all)  21.897123  16.733645   5.377012
## 3  wide_to_long_tidy_pivot(large_comb_all)   2.425926   2.033179   1.971338
## 4          wide_to_long_dt(large_comb_all)   1.000000   1.000000   1.000000
##      median         uq       max neval
## 1 474.24415 626.935248 17.354285   100
## 2  16.22731  15.866578  2.727346   100
## 3   2.07058   2.050507  2.425436   100
## 4   1.00000   1.000000  1.000000   100
## 
## $plot

Long to Wide

We again define helper functions to make each reshaping approach easy to apply, and first create long-format versions of the data sets to serve as input for the long-to-wide benchmarks.

# Materialize long-format versions of both data sets (via the fastest
# reshaper) as plain data.frames for the long -> wide benchmarks below.
small_comb_long_all <- as.data.frame(wide_to_long_dt(small_comb_all))
large_comb_long_all <- as.data.frame(wide_to_long_dt(large_comb_all))

# Long -> wide using base R's stats::reshape().
# NOTE(review): as in wide_to_long_base(), idvar is always
# names(small_demos) regardless of which data set df is -- presumably the
# demographic columns match in both; confirm upstream.
long_to_wide_base <- function(df){
  reshape(df,
          v.names   = val_name,   # value column to spread out
          timevar   = var_name,   # variable column supplying new names
          idvar     = names(small_demos),
          direction = "wide")
}

# Long -> wide using tidyr::spread(). spread() is superseded by
# pivot_wider() but is kept so both tidyr approaches can be benchmarked.
# !! injects the string column names.
long_to_wide_tidy_gather <- function(df){
  spread(df,
         key    = !!var_name,
         value  = !!val_name)
}

# Long -> wide using tidyr::pivot_wider().
# all_of() replaces the !! string injection: it is the explicit
# tidyselect idiom for selecting columns named by an external character
# vector, and it errors early if the column is missing.
long_to_wide_tidy_pivot <- function(df){
  pivot_wider(df,
              names_from  = all_of(var_name),  # column supplying new names
              values_from = all_of(val_name))  # column supplying cell values
}

# Long -> wide using data.table::dcast(); the "... ~ <var>" formula keeps
# every other column as an id variable.
long_to_wide_dt   <- function(df){
  cast_formula <- as.formula(paste0("... ~", var_name))
  dcast(as.data.table(df),
        cast_formula,
        value.var = val_name)
}

Benchmarks for small data:

# Benchmark all four long -> wide approaches on the small data.
bnch_shape_wide_small <- run_benchmark(
  long_to_wide_base(small_comb_long_all),
  long_to_wide_tidy_gather(small_comb_long_all),
  long_to_wide_tidy_pivot(small_comb_long_all),
  long_to_wide_dt(small_comb_long_all)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_shape_wide_small
## $benchmark
## Unit: milliseconds
##                                           expr       min        lq      mean
##         long_to_wide_base(small_comb_long_all) 70.051042 72.544492 85.502821
##  long_to_wide_tidy_gather(small_comb_long_all)  6.403182  7.101287  7.326382
##   long_to_wide_tidy_pivot(small_comb_long_all)  3.944102  4.844976  5.077928
##           long_to_wide_dt(small_comb_long_all)  3.918392  4.743412  4.905742
##     median        uq         max neval
##  74.647927 76.616593 1165.753955   100
##   7.286821  7.474488    9.388345   100
##   5.024573  5.360321    7.031711   100
##   4.877147  5.047589    7.186544   100
## 
## $default
##                                            expr       min        lq      mean
## 1        long_to_wide_base(small_comb_long_all) 70.051042 72.544492 85.502821
## 2 long_to_wide_tidy_gather(small_comb_long_all)  6.403182  7.101287  7.326382
## 3  long_to_wide_tidy_pivot(small_comb_long_all)  3.944102  4.844976  5.077928
## 4          long_to_wide_dt(small_comb_long_all)  3.918392  4.743412  4.905742
##      median        uq         max neval
## 1 74.647927 76.616593 1165.753955   100
## 2  7.286821  7.474488    9.388345   100
## 3  5.024573  5.360321    7.031711   100
## 4  4.877147  5.047589    7.186544   100
## 
## $relative
##                                            expr       min        lq      mean
## 1        long_to_wide_base(small_comb_long_all) 17.877497 15.293735 17.429133
## 2 long_to_wide_tidy_gather(small_comb_long_all)  1.634135  1.497084  1.493430
## 3  long_to_wide_tidy_pivot(small_comb_long_all)  1.006561  1.021411  1.035099
## 4          long_to_wide_dt(small_comb_long_all)  1.000000  1.000000  1.000000
##      median        uq         max neval
## 1 15.305655 15.178849 162.2134304   100
## 2  1.494075  1.480804   1.3063783   100
## 3  1.030228  1.061957   0.9784552   100
## 4  1.000000  1.000000   1.0000000   100
## 
## $plot

Benchmarks for large data:

# Benchmark all four long -> wide approaches on the large data.
bnch_shape_wide_large <- run_benchmark(
  long_to_wide_base(large_comb_long_all),
  long_to_wide_tidy_gather(large_comb_long_all),
  long_to_wide_tidy_pivot(large_comb_long_all),
  long_to_wide_dt(large_comb_long_all)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_shape_wide_large
## $benchmark
## Unit: milliseconds
##                                           expr        min         lq       mean
##         long_to_wide_base(large_comb_long_all) 18574.4783 20588.2739 26376.0326
##  long_to_wide_tidy_gather(large_comb_long_all)   944.4465   982.2077  1522.8837
##   long_to_wide_tidy_pivot(large_comb_long_all)   137.2820   148.1601   316.1469
##           long_to_wide_dt(large_comb_long_all)   319.9624   331.2554   606.2003
##      median         uq       max neval
##  22432.8338 33143.0771 44232.265   100
##    999.8674  2537.2729  4187.677   100
##    150.1174   153.0628  6315.400   100
##    335.4374   341.9275  4530.148   100
## 
## $default
##                                            expr        min         lq
## 1        long_to_wide_base(large_comb_long_all) 18574.4783 20588.2739
## 2 long_to_wide_tidy_gather(large_comb_long_all)   944.4465   982.2077
## 3  long_to_wide_tidy_pivot(large_comb_long_all)   137.2820   148.1601
## 4          long_to_wide_dt(large_comb_long_all)   319.9624   331.2554
##         mean     median         uq       max neval
## 1 26376.0326 22432.8338 33143.0771 44232.265   100
## 2  1522.8837   999.8674  2537.2729  4187.677   100
## 3   316.1469   150.1174   153.0628  6315.400   100
## 4   606.2003   335.4374   341.9275  4530.148   100
## 
## $relative
##                                            expr        min         lq      mean
## 1        long_to_wide_base(large_comb_long_all) 135.301612 138.959628 83.429686
## 2 long_to_wide_tidy_gather(large_comb_long_all)   6.879608   6.629366  4.817014
## 3  long_to_wide_tidy_pivot(large_comb_long_all)   1.000000   1.000000  1.000000
## 4          long_to_wide_dt(large_comb_long_all)   2.330694   2.235793  1.917464
##       median         uq       max neval
## 1 149.435306 216.532552 7.0038735   100
## 2   6.660571  16.576680 0.6630897   100
## 3   1.000000   1.000000 1.0000000   100
## 4   2.234501   2.233903 0.7173176   100
## 
## $plot

Aggregating Data

Let’s define some preliminary objects shared across all of the aggregation benchmarks (re-stating those from the reshaping section): 1. the names of the variable column and the value column, and 2. the names of the wide-format score columns to aggregate.

# Shared objects for the aggregation benchmarks. These re-state the
# definitions from the reshaping section so this section is self-contained.
var_name  <- "trait"        # name of the variable column in long format
val_name  <- "normed_score" # name of the value column in long format
# Names of the wide score columns (X1, X2, ...) used by the wide-format
# aggregation functions.
var_vals  <- grep(x       = names(small_scores),
                  pattern = "^X[0-9]+",
                  value   = TRUE)

Long Format Data

We can add functions to quickly aggregate data and make it easier to apply (using NSE to remove overhead of pasting and creating formulas).

# Aggregate long-format data with base R: one stats::aggregate() call per
# statistic, grouping normed_score by data_level and trait.
#
# @param df data.frame with columns data_level, trait, normed_score.
# @return list of two data.frames: group means first, group sds second.
aggr_long_base <- function(df){
  # Locals are named mean_tab/sd_tab rather than mn/sd: the original
  # bound a local `sd`, shadowing stats::sd and making `FUN = sd`
  # fragile to any reordering of the statements.
  mean_tab <- aggregate(normed_score ~ data_level + trait,
                        FUN  = mean,
                        data = df)
  sd_tab   <- aggregate(normed_score ~ data_level + trait,
                        FUN  = sd,
                        data = df)
  list(mean_tab, sd_tab)
}

# Aggregate long-format data with dplyr: grouped mean and sd of
# normed_score per data_level x trait cell.
# NOTE(review): summarize() drops only the last grouping level, so the
# result is still grouped by data_level -- harmless for timing, but
# callers reusing the result should ungroup().
aggr_long_tidy <- function(df){
  df %>%
    group_by(data_level, trait) %>%
    summarize(mean_score = mean(normed_score),
              sd_score   = sd(normed_score))
}

# Aggregate long-format data with data.table: grouped mean and sd of
# normed_score per data_level x trait cell, computed in a single pass.
aggr_long_dt   <- function(df){
  as.data.table(df)[, .(mean_score = mean(normed_score),
                        sd_score   = sd(normed_score)),
                    by = .(data_level, trait)]
}

Benchmarks for small data:

# Benchmark the three long-format aggregation approaches on the small data.
bnch_aggr_long_small <- run_benchmark(
  aggr_long_base(small_comb_long_all),
  aggr_long_tidy(small_comb_long_all),
  aggr_long_dt(small_comb_long_all)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_aggr_long_small
## $benchmark
## Unit: milliseconds
##                                 expr      min       lq     mean   median
##  aggr_long_base(small_comb_long_all) 9.531267 9.752885 9.986650 9.843245
##  aggr_long_tidy(small_comb_long_all) 5.465912 5.945051 6.347606 6.285369
##    aggr_long_dt(small_comb_long_all) 1.156154 1.331351 1.451090 1.444099
##         uq       max neval
##  10.067502 12.551216   100
##   6.447125 12.352523   100
##   1.525438  4.935919   100
## 
## $default
##                                  expr      min       lq     mean   median
## 1 aggr_long_base(small_comb_long_all) 9.531267 9.752885 9.986650 9.843245
## 2 aggr_long_tidy(small_comb_long_all) 5.465912 5.945051 6.347606 6.285369
## 3   aggr_long_dt(small_comb_long_all) 1.156154 1.331351 1.451090 1.444099
##          uq       max neval
## 1 10.067502 12.551216   100
## 2  6.447125 12.352523   100
## 3  1.525438  4.935919   100
## 
## $relative
##                                  expr      min       lq     mean   median
## 1 aggr_long_base(small_comb_long_all) 8.243942 7.325553 6.882171 6.816184
## 2 aggr_long_tidy(small_comb_long_all) 4.727668 4.465426 4.374371 4.352450
## 3   aggr_long_dt(small_comb_long_all) 1.000000 1.000000 1.000000 1.000000
##         uq      max neval
## 1 6.599745 2.542833   100
## 2 4.226409 2.502578   100
## 3 1.000000 1.000000   100
## 
## $plot

Benchmarks for large data:

# Benchmark the three long-format aggregation approaches on the large data.
bnch_aggr_long_large <- run_benchmark(
  aggr_long_base(large_comb_long_all),
  aggr_long_tidy(large_comb_long_all),
  aggr_long_dt(large_comb_long_all)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_aggr_long_large
## $benchmark
## Unit: milliseconds
##                                 expr       min         lq       mean     median
##  aggr_long_base(large_comb_long_all) 947.47474 1085.45412 1103.46696 1109.38487
##  aggr_long_tidy(large_comb_long_all)  38.65261   43.65946   47.83091   44.88564
##    aggr_long_dt(large_comb_long_all)  59.31455   74.63615   81.96242   76.36451
##          uq       max neval
##  1135.51806 1440.8471   100
##    45.69506  225.3857   100
##    78.32696  252.7661   100
## 
## $default
##                                  expr       min         lq       mean
## 1 aggr_long_base(large_comb_long_all) 947.47474 1085.45412 1103.46696
## 2 aggr_long_tidy(large_comb_long_all)  38.65261   43.65946   47.83091
## 3   aggr_long_dt(large_comb_long_all)  59.31455   74.63615   81.96242
##       median         uq       max neval
## 1 1109.38487 1135.51806 1440.8471   100
## 2   44.88564   45.69506  225.3857   100
## 3   76.36451   78.32696  252.7661   100
## 
## $relative
##                                  expr       min        lq      mean    median
## 1 aggr_long_base(large_comb_long_all) 24.512571 24.861833 23.070164 24.715807
## 2 aggr_long_tidy(large_comb_long_all)  1.000000  1.000000  1.000000  1.000000
## 3   aggr_long_dt(large_comb_long_all)  1.534555  1.709507  1.713587  1.701313
##          uq      max neval
## 1 24.849911 6.392805   100
## 2  1.000000 1.000000   100
## 3  1.714123 1.121482   100
## 
## $plot

Wide Format Data

We can add functions to quickly aggregate data and make it easier to apply (using NSE to remove overhead of pasting and creating formulas).

# Aggregate wide-format data with base R: one stats::aggregate() call per
# statistic over the score columns named in the global var_vals, grouped
# by data_level.
#
# @param df data.frame with a data_level column plus the var_vals columns.
# @return list of two data.frames: group means first, group sds second.
aggr_wide_base <- function(df){
  # Locals are named mean_tab/sd_tab rather than mn/sd: the original
  # bound a local `sd`, shadowing stats::sd and making `FUN = sd`
  # fragile to any reordering of the statements.
  mean_tab <- aggregate(x   = df[var_vals],
                        by  = df[c("data_level")],
                        FUN = mean)
  sd_tab   <- aggregate(x   = df[var_vals],
                        by  = df[c("data_level")],
                        FUN = sd)
  list(mean_tab, sd_tab)
}

# Aggregate wide-format data with dplyr: mean and sd of every score
# column in the global var_vals, grouped by data_level. across() with a
# named list of functions yields <col>_mean / <col>_sd output columns.
aggr_wide_tidy <- function(df){
  df %>%
    group_by(data_level) %>%
    summarize(across(.cols = all_of(var_vals),
                     .fns  = list(mean = mean,
                                  sd   = sd)))
}

# Aggregate wide-format data with data.table: mean and sd of every score
# column in the global var_vals, grouped by data_level.
aggr_wide_dt   <- function(df){
  dt       <- as.data.table(df)
  # .SD holds the var_vals columns per group (via .SDcols); each lapply
  # computes one statistic across all of them, with setNames() prefixing
  # the output columns as mean_<col> / sd_<col>. c() splices the two
  # result lists into a single row per group.
  dt[, c(lapply(setNames(.SD, paste0("mean_", names(.SD))), mean),
         lapply(setNames(.SD, paste0("sd_",   names(.SD))), sd)),
     by      = data_level,
     .SDcols = var_vals]
}

Benchmarks for small data:

# Benchmark the three wide-format aggregation approaches on the small data.
bnch_aggr_wide_small <- run_benchmark(
  aggr_wide_base(small_comb_all),
  aggr_wide_tidy(small_comb_all),
  aggr_wide_dt(small_comb_all)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_aggr_wide_small
## $benchmark
## Unit: milliseconds
##                            expr      min       lq     mean   median       uq
##  aggr_wide_base(small_comb_all) 4.317722 4.523416 4.664629 4.580209 4.712265
##  aggr_wide_tidy(small_comb_all) 5.958957 6.501841 7.834972 6.790501 7.117837
##    aggr_wide_dt(small_comb_all) 1.790795 1.910086 2.073120 2.006660 2.096089
##         max neval
##    9.510568   100
##  104.714960   100
##    7.889932   100
## 
## $default
##                             expr      min       lq     mean   median       uq
## 1 aggr_wide_base(small_comb_all) 4.317722 4.523416 4.664629 4.580209 4.712265
## 2 aggr_wide_tidy(small_comb_all) 5.958957 6.501841 7.834972 6.790501 7.117837
## 3   aggr_wide_dt(small_comb_all) 1.790795 1.910086 2.073120 2.006660 2.096089
##          max neval
## 1   9.510568   100
## 2 104.714960   100
## 3   7.889932   100
## 
## $relative
##                             expr      min       lq     mean   median       uq
## 1 aggr_wide_base(small_comb_all) 2.411064 2.368174 2.250053 2.282504 2.248123
## 2 aggr_wide_tidy(small_comb_all) 3.327548 3.403952 3.779315 3.383982 3.395771
## 3   aggr_wide_dt(small_comb_all) 1.000000 1.000000 1.000000 1.000000 1.000000
##         max neval
## 1  1.205406   100
## 2 13.271972   100
## 3  1.000000   100
## 
## $plot

Benchmarks for large data:

# Benchmark the three wide-format aggregation approaches on the large data.
bnch_aggr_wide_large <- run_benchmark(
  aggr_wide_base(large_comb_all),
  aggr_wide_tidy(large_comb_all),
  aggr_wide_dt(large_comb_all)
)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
bnch_aggr_wide_large
## $benchmark
## Unit: milliseconds
##                            expr       min        lq      mean    median
##  aggr_wide_base(large_comb_all) 227.13527 229.21543 249.78201 230.98965
##  aggr_wide_tidy(large_comb_all)  18.04147  19.06485  19.42071  19.36601
##    aggr_wide_dt(large_comb_all)  16.77064  17.34240  20.76989  17.78978
##         uq       max neval
##  232.86863 470.24675   100
##   19.76823  20.84071   100
##   18.08370 177.43426   100
## 
## $default
##                             expr       min        lq      mean    median
## 1 aggr_wide_base(large_comb_all) 227.13527 229.21543 249.78201 230.98965
## 2 aggr_wide_tidy(large_comb_all)  18.04147  19.06485  19.42071  19.36601
## 3   aggr_wide_dt(large_comb_all)  16.77064  17.34240  20.76989  17.78978
##          uq       max neval
## 1 232.86863 470.24675   100
## 2  19.76823  20.84071   100
## 3  18.08370 177.43426   100
## 
## $relative
##                             expr       min       lq       mean    median
## 1 aggr_wide_base(large_comb_all) 13.543626 13.21705 12.0261590 12.984402
## 2 aggr_wide_tidy(large_comb_all)  1.075777  1.09932  0.9350417  1.088603
## 3   aggr_wide_dt(large_comb_all)  1.000000  1.00000  1.0000000  1.000000
##          uq      max neval
## 1 12.877264 2.650259   100
## 2  1.093152 0.117456   100
## 3  1.000000 1.000000   100
## 
## $plot